2. Feature Preparation
2.1 Select Numeric Features
# Core numeric features for PCA/clustering
numeric_features <- c(
"hp", "attack", "defense", "sp_atk", "sp_def", "speed",
"height_m", "weight_kgs", "bmi"
)
# Create analysis dataframe
pca_df <- pokemon_df |>
select(dex, name, category, generation, type_1, all_of(numeric_features)) |>
drop_na()
cat(glue("
Analysis dataset: {nrow(pca_df)} Pokemon
Features: {length(numeric_features)} numeric variables
"))
## Analysis dataset: 742 Pokemon
## Features: 9 numeric variables
head(pca_df)
## # A tibble: 6 × 14
## dex name category generation type_1 hp attack defense sp_atk sp_def
## <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 3 Venusaur Regular Gen 1 Grass 80 82 83 100 100
## 2 3 Mega Venu… Regular Gen 1 Grass 80 100 123 122 120
## 3 6 Charizard Regular Gen 1 Fire 78 84 78 109 85
## 4 6 Mega Char… Regular Gen 1 Fire 78 104 78 159 115
## 5 6 Mega Char… Regular Gen 1 Fire 78 130 111 130 85
## 6 9 Blastoise Regular Gen 1 Water 79 83 100 85 105
## # ℹ 4 more variables: speed <dbl>, height_m <dbl>, weight_kgs <dbl>, bmi <dbl>
2.3 Outlier Detection and Analysis
cat("=== OUTLIER DETECTION ===\n\n")
## === OUTLIER DETECTION ===
# Function to detect outliers using IQR method
detect_outliers <- function(x, multiplier = 3) {
q1 <- quantile(x, 0.25, na.rm = TRUE)
q3 <- quantile(x, 0.75, na.rm = TRUE)
iqr <- q3 - q1
lower <- q1 - multiplier * iqr
upper <- q3 + multiplier * iqr
return(x < lower | x > upper)
}
# Extract numeric data for outlier detection
numeric_data_temp <- pca_df |> select(all_of(numeric_features))
# Identify outliers across all numeric features
outlier_flags <- numeric_data_temp |>
transmute(across(everything(), ~ detect_outliers(.), .names = "{.col}_outlier"))
# Count outliers per Pokemon
pca_df$outlier_count <- rowSums(outlier_flags)
# Identify extreme outliers (outliers in 3+ features)
extreme_outliers <- pca_df |>
filter(outlier_count >= 3) |>
arrange(desc(outlier_count))
cat(glue("
Extreme outliers (3+ features affected): {nrow(extreme_outliers)}\n\n"))
## Extreme outliers (3+ features affected): 4
if (nrow(extreme_outliers) > 0) {
outlier_summary <- extreme_outliers |>
select(
dex, name, category, outlier_count, hp, attack, defense,
height_m, weight_kgs, bmi
) |>
head(10)
print(knitr::kable(outlier_summary,
caption = "Top Extreme Outliers",
digits = 1
))
}
##
##
## Table: Top Extreme Outliers
##
## | dex|name |category | outlier_count| hp| attack| defense| height_m| weight_kgs| bmi|
## |---:|:------------|:-----------|-------------:|---:|------:|-------:|--------:|----------:|----:|
## | 208|Steelix |Regular | 3| 75| 85| 200| 9.2| 400| 4.7|
## | 208|Mega Steelix |Regular | 3| 75| 125| 230| 10.5| 740| 6.7|
## | 799|Guzzlord |Ultra Beast | 3| 223| 101| 53| 5.5| 888| 29.4|
## | 805|Stakataka |Ultra Beast | 3| 61| 131| 211| 5.5| 820| 27.1|
# Visualize outlier distribution
ggplot(pca_df, aes(x = outlier_count)) +
geom_bar(fill = "steelblue", alpha = 0.7) +
labs(
title = "Distribution of Outlier Counts per Pokemon",
subtitle = "Number of features where Pokemon is an outlier (>3 IQR)",
x = "Number of Outlier Features",
y = "Count"
) +
theme_minimal()

# Box plots for key features showing outliers
key_features <- c("hp", "attack", "defense", "height_m", "weight_kgs", "bmi")
pca_df |>
select(all_of(key_features)) |>
pivot_longer(everything(), names_to = "Feature", values_to = "Value") |>
ggplot(aes(x = Feature, y = Value)) +
geom_boxplot(fill = "lightblue", outlier.color = "red", outlier.size = 2) +
facet_wrap(~Feature, scales = "free", ncol = 3) +
labs(
title = "Box Plots Showing Outliers in Key Features",
subtitle = "Red points indicate outliers"
) +
theme_minimal() +
theme(axis.text.x = element_blank())

2.4 Outlier Treatment Strategy
cat("\n=== OUTLIER TREATMENT OPTIONS ===\n\n")
##
## === OUTLIER TREATMENT OPTIONS ===
# Option 1: Analyze with all data
pca_df_full <- pca_df
# Option 2: Remove extreme outliers (3+ features)
pca_df_clean <- pca_df |>
filter(outlier_count < 3)
# Option 3: Winsorize extreme values (cap at 99th percentile)
pca_df_winsorized <- pca_df
for (col in numeric_features) {
p99 <- quantile(pca_df[[col]], 0.99, na.rm = TRUE)
p01 <- quantile(pca_df[[col]], 0.01, na.rm = TRUE)
pca_df_winsorized[[col]] <- pmax(pmin(pca_df[[col]], p99), p01)
}
cat(glue("
Dataset Sizes:
"))
## Dataset Sizes:
# Highlight removed outliers
if (nrow(extreme_outliers) > 0) {
cat("\nRemoved extreme outliers:\n")
removed <- extreme_outliers |>
select(name, category, outlier_count) |>
arrange(desc(outlier_count))
print(knitr::kable(removed,
caption = "Pokemon Removed as Extreme Outliers"
))
}
##
## Removed extreme outliers:
##
##
## Table: Pokemon Removed as Extreme Outliers
##
## |name |category | outlier_count|
## |:------------|:-----------|-------------:|
## |Steelix |Regular | 3|
## |Mega Steelix |Regular | 3|
## |Guzzlord |Ultra Beast | 3|
## |Stakataka |Ultra Beast | 3|
# Decision: Use cleaned data for main analysis
cat("\n** DECISION: Using cleaned dataset (extreme outliers removed) for main analysis **\n")
##
## ** DECISION: Using cleaned dataset (extreme outliers removed) for main analysis **
cat("** Outliers will be discussed separately in Section 6 **\n\n")
## ** Outliers will be discussed separately in Section 6 **
# Update working dataset
pca_df <- pca_df_clean
# Re-select numeric features from cleaned data
numeric_features_clean <- numeric_features # Keep feature names
# --- Additional outlier handling: single-variable z-score detection and export ---
## Detect extreme single-variable values using z-score (robust threshold)
zscore_threshold <- 6
zscore_flags <- pca_df_full |>
select(all_of(numeric_features)) |>
transmute(across(everything(), ~ abs(scale(.)), .names = "{.col}_z"))
# Identify rows where any variable exceeds threshold
zscore_outliers <- which(rowSums(zscore_flags > zscore_threshold) > 0)
# Create a combined outliers dataframe (IQR-based extreme OR z-score extreme)
outlier_dexes <- unique(c(extreme_outliers$dex, pca_df_full$dex[zscore_outliers]))
pokemon_outliers <- pca_df_full |>
filter(dex %in% outlier_dexes) |>
select(dex, name, category, generation, all_of(numeric_features), outlier_count) |>
arrange(desc(outlier_count))
# Ensure export directory exists and write CSV
if (!dir.exists("data/analysis/pca")) dir.create("data/analysis/pca", recursive = TRUE)
write_csv(pokemon_outliers, "data/analysis/pca/pokemon_outliers.csv")
cat(glue("\nExported {nrow(pokemon_outliers)} combined outliers to 'data/analysis/pca/pokemon_outliers.csv'\n"))
## Exported 14 combined outliers to 'data/analysis/pca/pokemon_outliers.csv'
# Update cleaned dataset to exclude any combined outliers (conservative)
pca_df <- pca_df_full |>
filter(!dex %in% outlier_dexes)
# Recompute numeric data and scaled_data after final cleaning
numeric_features_clean <- numeric_features
numeric_data <- pca_df |>
select(all_of(numeric_features_clean))
scaled_data <- scale(numeric_data)
2.5 Scale Features (After Outlier Removal)
# Extract numeric data for scaling (from cleaned dataset)
numeric_data <- pca_df |>
select(all_of(numeric_features))
# Standardize (mean=0, sd=1)
scaled_data <- scale(numeric_data)
# Verify scaling
cat("Scaled data summary:\n")
## Scaled data summary:
summary(scaled_data)
## hp attack defense sp_atk
## Min. :-3.61510 Min. :-2.93743 Min. :-2.4607 Min. :-2.42329
## 1st Qu.:-0.59203 1st Qu.:-0.71699 1st Qu.:-0.6892 1st Qu.:-0.85749
## Median :-0.09004 Median :-0.03379 Median :-0.2464 Median :-0.07459
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.57928 3rd Qu.: 0.64942 3rd Qu.: 0.4917 3rd Qu.: 0.67699
## Max. : 5.97842 Max. : 3.21146 Max. : 5.2895 Max. : 3.33885
## sp_def speed height_m weight_kgs
## Min. :-2.7222 Min. :-2.6353 Min. :-1.2379 Min. :-0.68996
## 1st Qu.:-0.6434 1st Qu.:-0.7498 1st Qu.:-0.5075 1st Qu.:-0.52745
## Median :-0.1445 Median : 0.1072 Median :-0.1829 Median :-0.33927
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.6038 3rd Qu.: 0.6214 3rd Qu.: 0.2228 3rd Qu.: 0.05265
## Max. : 6.0086 Max. : 4.0495 Max. : 7.5262 Max. : 6.06588
## bmi
## Min. :-1.0254
## 1st Qu.:-0.5386
## Median :-0.2589
## Mean : 0.0000
## 3rd Qu.: 0.1353
## Max. : 7.6640
3. Principal Component Analysis (PCA)
3.1 Compute PCA
# Perform PCA on scaled data
# Data is already standardized, so center=FALSE, scale=FALSE
pca_result <- prcomp(scaled_data)
# Summary
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.5998 1.2790 1.1556 0.9580 0.92097 0.74737 0.73193
## Proportion of Variance 0.2844 0.1817 0.1484 0.1020 0.09424 0.06206 0.05953
## Cumulative Proportion 0.2844 0.4661 0.6145 0.7165 0.81073 0.87280 0.93232
## PC8 PC9
## Standard deviation 0.64145 0.44457
## Proportion of Variance 0.04572 0.02196
## Cumulative Proportion 0.97804 1.00000
# Scree plot - variance explained
fviz_eig(pca_result,
addlabels = TRUE, ylim = c(0, 50),
main = "Scree Plot - Variance Explained by Each PC"
)

3.2 Loadings and Contributions
# Variable contributions to PC1 and PC2
fviz_pca_var(pca_result,
col.var = "contrib",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE,
title = "PCA - Variable Contributions"
)

# Loadings for first 3 PCs
loadings_df <- as.data.frame(pca_result$rotation[, 1:3]) |>
rownames_to_column("Variable") |>
arrange(desc(abs(PC1)))
print(knitr::kable(loadings_df,
digits = 3,
caption = "PCA Loadings (First 3 Components)"
))
##
##
## Table: PCA Loadings (First 3 Components)
##
## |Variable | PC1| PC2| PC3|
## |:----------|------:|------:|------:|
## |weight_kgs | 0.543| 0.049| -0.110|
## |height_m | 0.474| -0.318| 0.033|
## |hp | 0.368| -0.075| -0.213|
## |defense | 0.359| 0.370| 0.214|
## |attack | 0.331| -0.165| -0.445|
## |sp_def | 0.208| 0.025| 0.700|
## |bmi | 0.179| 0.472| -0.153|
## |sp_atk | 0.143| -0.491| 0.389|
## |speed | -0.106| -0.512| -0.181|
3.3 Biplot - Pokemon in PC Space
# Add PC scores to dataframe
pca_scores <- as.data.frame(pca_result$x[, 1:5])
pca_plot_df <- bind_cols(pca_df, pca_scores)
# PC1 vs PC2 colored by category
ggplot(pca_plot_df, aes(x = PC1, y = PC2, color = category)) +
geom_point(alpha = 0.6, size = 2) +
scale_color_brewer(palette = "Set2") +
labs(
title = "PCA: Pokemon by Category",
subtitle = "First two principal components",
x = glue("PC1 ({round(summary(pca_result)$importance[2,1]*100, 1)}% variance)"),
y = glue("PC2 ({round(summary(pca_result)$importance[2,2]*100, 1)}% variance)"),
color = "Category"
) +
theme(legend.position = "right")

# PC1 vs PC2 colored by generation
ggplot(pca_plot_df, aes(x = PC1, y = PC2, color = generation)) +
geom_point(alpha = 0.6, size = 2) +
scale_color_brewer(palette = "Spectral") +
labs(
title = "PCA: Pokemon by Generation",
x = glue("PC1 ({round(summary(pca_result)$importance[2,1]*100, 1)}% variance)"),
y = glue("PC2 ({round(summary(pca_result)$importance[2,2]*100, 1)}% variance)"),
color = "Generation"
)

# PC1 vs PC2 by primary type (top types only)
top_types <- pca_plot_df |>
count(type_1, sort = TRUE) |>
head(8) |>
pull(type_1)
pca_plot_df |>
filter(type_1 %in% top_types) |>
ggplot(aes(x = PC1, y = PC2, color = type_1)) +
geom_point(alpha = 0.6, size = 2) +
labs(
title = "PCA: Pokemon by Primary Type (Top 8)",
x = glue("PC1 ({round(summary(pca_result)$importance[2,1]*100, 1)}% variance)"),
y = glue("PC2 ({round(summary(pca_result)$importance[2,2]*100, 1)}% variance)"),
color = "Type"
)

3.4 3D Visualization (PC1, PC2, PC3)
# Prepare 3D plot data
pca_3d_df <- pca_plot_df |>
select(name, PC1, PC2, PC3, category, type_1)
# Interactive 3D plot (if plotly available)
if (requireNamespace("plotly", quietly = TRUE)) {
library(plotly)
plot_ly(pca_3d_df,
x = ~PC1, y = ~PC2, z = ~PC3,
color = ~category, colors = "Set2",
text = ~name, type = "scatter3d", mode = "markers",
marker = list(size = 3)
) |>
layout(
title = "PCA 3D: First Three Components",
scene = list(
xaxis = list(title = "PC1"),
yaxis = list(title = "PC2"),
zaxis = list(title = "PC3")
)
)
} else {
cat("Install plotly for interactive 3D visualization\n")
}
4. Clustering Analysis
4.1 K-Means Clustering
# Use k=4 for clustering (a reasonable choice for Pokemon categorization)
k <- 4
set.seed(123)
kmeans_result <- kmeans(scaled_data, centers = k, nstart = 25)
cat(glue("
K-Means Clustering (k={k}):
- Cluster sizes: {paste(kmeans_result$size, collapse=', ')}
- Total within-cluster SS: {round(kmeans_result$tot.withinss, 2)}
- Between-cluster SS / Total SS: {round(kmeans_result$betweenss / kmeans_result$totss * 100, 1)}%
"))
## K-Means Clustering (k=4):
## - Cluster sizes: 50, 57, 300, 321
## - Total within-cluster SS: 4346.37
## - Between-cluster SS / Total SS: 33.6%
# Add cluster labels to dataframe
pca_plot_df$cluster <- factor(kmeans_result$cluster)
# Visualize clusters in PCA space
fviz_cluster(kmeans_result,
data = scaled_data,
palette = "Set2",
ellipse.type = "convex",
ggtheme = theme_minimal(),
main = "K-Means Clusters in PCA Space"
)

# Manual plot with more control
ggplot(pca_plot_df, aes(x = PC1, y = PC2, color = cluster)) +
geom_point(alpha = 0.6, size = 2.5) +
stat_ellipse(level = 0.95, linetype = 2) +
scale_color_brewer(palette = "Set1") +
labs(
title = glue("K-Means Clustering (k={k}) in PCA Space"),
x = glue("PC1 ({round(summary(pca_result)$importance[2,1]*100, 1)}%)"),
y = glue("PC2 ({round(summary(pca_result)$importance[2,2]*100, 1)}%)"),
color = "Cluster"
)

4.3 Cluster Profiling
# Attach cluster labels to original data
cluster_profile_df <- pca_df |>
mutate(cluster = factor(kmeans_result$cluster))
# Compute cluster statistics
cluster_stats <- cluster_profile_df |>
group_by(cluster) |>
summarise(
count = n(),
avg_hp = mean(hp),
avg_attack = mean(attack),
avg_defense = mean(defense),
avg_sp_atk = mean(sp_atk),
avg_sp_def = mean(sp_def),
avg_speed = mean(speed),
avg_total = mean(hp + attack + defense + sp_atk + sp_def + speed),
pct_legendary = mean(category == "Legendary") * 100,
pct_mythical = mean(category == "Mythical") * 100
) |>
arrange(cluster)
print(knitr::kable(cluster_stats,
digits = 1,
caption = "Cluster Profiles - Average Stats"
))
##
##
## Table: Cluster Profiles - Average Stats
##
## |cluster | count| avg_hp| avg_attack| avg_defense| avg_sp_atk| avg_sp_def| avg_speed| avg_total| pct_legendary| pct_mythical|
## |:-------|-----:|------:|----------:|-----------:|----------:|----------:|---------:|---------:|-------------:|------------:|
## |1 | 50| 108.0| 127.0| 105.8| 121.5| 108.9| 95.0| 666.3| 70.0| 4.0|
## |2 | 57| 90.8| 108.2| 119.5| 70.6| 83.7| 58.6| 531.5| 12.3| 1.8|
## |3 | 300| 70.3| 87.7| 68.5| 89.7| 75.6| 100.8| 492.6| 8.0| 3.7|
## |4 | 321| 87.4| 96.7| 94.9| 82.9| 91.3| 66.2| 519.4| 10.6| 4.4|
# Heatmap of cluster centers
cluster_centers <- as.data.frame(kmeans_result$centers) |>
rownames_to_column("Cluster") |>
pivot_longer(-Cluster, names_to = "Feature", values_to = "Value")
ggplot(cluster_centers, aes(x = Feature, y = Cluster, fill = Value)) +
geom_tile() +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
labs(
title = "K-Means Cluster Centers (Standardized)",
x = "", y = "Cluster", fill = "Scaled\nValue"
) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Cluster composition by category
cluster_category <- cluster_profile_df |>
count(cluster, category) |>
group_by(cluster) |>
mutate(percentage = round(100 * n / sum(n), 1))
ggplot(cluster_category, aes(x = cluster, y = n, fill = category)) +
geom_col(position = "stack") +
geom_text(aes(label = glue("{n} ({percentage}%)")),
position = position_stack(vjust = 0.5), size = 3
) +
scale_fill_brewer(palette = "Set2") +
labs(
title = "Cluster Composition by Pokemon Category",
x = "Cluster", y = "Count", fill = "Category"
)

4.4 Hierarchical Clustering (Beyond Course Scope)
# [COMMENTED OUT - Beyond course scope]
# Hierarchical clustering requires understanding of distance metrics,
# linkage methods, and dendrogram interpretation
# # Compute distance matrix (sample for speed)
# set.seed(123)
# sample_idx <- sample(1:nrow(scaled_data), min(200, nrow(scaled_data)))
# dist_matrix <- dist(scaled_data[sample_idx, ], method = "euclidean")
#
# # Hierarchical clustering
# hc_result <- hclust(dist_matrix, method = "ward.D2")
#
# # Dendrogram
# fviz_dend(hc_result,
# k = k,
# cex = 0.5,
# k_colors = "Set2",
# color_labels_by_k = TRUE,
# rect = TRUE,
# main = glue("Hierarchical Clustering Dendrogram (n={length(sample_idx)})")
# )
6. Outlier Pokemon Analysis
6.1 Extreme Outliers Discussion
cat("=== EXTREME OUTLIER POKEMON ===\n\n")
## === EXTREME OUTLIER POKEMON ===
# Re-examine extreme outliers
extreme_cases <- pca_df_full |>
filter(outlier_count >= 3) |>
arrange(desc(outlier_count))
if (nrow(extreme_cases) > 0) {
cat(glue("Total extreme outliers: {nrow(extreme_cases)}\n\n"))
# Detailed view
extreme_detail <- extreme_cases |>
select(
dex, name, category, generation, type_1,
hp, attack, defense, sp_atk, sp_def, speed,
height_m, weight_kgs, bmi, outlier_count
)
print(knitr::kable(extreme_detail,
caption = "Detailed Stats of Extreme Outliers",
digits = 1
))
# Why are they outliers?
cat("\n### Why These Pokemon Are Outliers:\n\n")
for (i in 1:min(5, nrow(extreme_cases))) {
poke <- extreme_cases[i, ]
cat(glue("{i}. **{poke$name}** ({poke$category})\n"))
# Find which features are outliers
outlier_features <- c()
for (feat in numeric_features) {
x <- pca_df_full[[feat]]
q1 <- quantile(x, 0.25, na.rm = TRUE)
q3 <- quantile(x, 0.75, na.rm = TRUE)
iqr <- q3 - q1
lower <- q1 - 3 * iqr
upper <- q3 + 3 * iqr
val <- poke[[feat]]
if (val < lower | val > upper) {
outlier_features <- c(
outlier_features,
glue("{feat}={round(val, 1)}")
)
}
}
cat(glue(" Extreme in: {paste(outlier_features, collapse=', ')}\n"))
cat("\n")
}
}
## Total extreme outliers: 4
##
##
## Table: Detailed Stats of Extreme Outliers
##
## | dex|name |category |generation |type_1 | hp| attack| defense| sp_atk| sp_def| speed| height_m| weight_kgs| bmi| outlier_count|
## |---:|:------------|:-----------|:----------|:------|---:|------:|-------:|------:|------:|-----:|--------:|----------:|----:|-------------:|
## | 208|Steelix |Regular |Gen 2 |Steel | 75| 85| 200| 55| 65| 30| 9.2| 400| 4.7| 3|
## | 208|Mega Steelix |Regular |Gen 2 |Steel | 75| 125| 230| 55| 95| 30| 10.5| 740| 6.7| 3|
## | 799|Guzzlord |Ultra Beast |Gen 7 |Dark | 223| 101| 53| 97| 53| 43| 5.5| 888| 29.4| 3|
## | 805|Stakataka |Ultra Beast |Gen 7 |Rock | 61| 131| 211| 53| 101| 13| 5.5| 820| 27.1| 3|
##
## ### Why These Pokemon Are Outliers:
##
## 1. **Steelix** (Regular)Extreme in: defense=200, height_m=9.2, weight_kgs=400
## 2. **Mega Steelix** (Regular)Extreme in: defense=230, height_m=10.5, weight_kgs=740
## 3. **Guzzlord** (Ultra Beast)Extreme in: hp=223, height_m=5.5, weight_kgs=888
## 4. **Stakataka** (Ultra Beast)Extreme in: defense=211, height_m=5.5, weight_kgs=820
6.2 Visualize Outliers in PCA Space
# Perform PCA on FULL dataset to see where outliers fall
scaled_full <- scale(pca_df_full |> select(all_of(numeric_features)))
pca_full <- prcomp(scaled_full, center = TRUE, scale. = TRUE)
pca_full_df <- pca_df_full |>
bind_cols(as.data.frame(pca_full$x[, 1:2])) |>
mutate(is_extreme = outlier_count >= 3)
# Plot outliers in PCA space
ggplot(pca_full_df, aes(x = PC1, y = PC2, color = is_extreme, size = is_extreme)) +
geom_point(alpha = 0.6) +
geom_text(
data = filter(pca_full_df, is_extreme),
aes(label = name),
size = 3, vjust = -1, hjust = 0.5, color = "black"
) +
scale_color_manual(
values = c("FALSE" = "gray70", "TRUE" = "red"),
labels = c("Normal", "Extreme Outlier")
) +
scale_size_manual(values = c("FALSE" = 2, "TRUE" = 4), guide = "none") +
labs(
title = "Extreme Outliers in PCA Space (Full Dataset)",
subtitle = "Outliers are labeled and shown in red",
color = "Status"
) +
theme_minimal()

6.3 Impact on Clustering
cat("=== IMPACT OF OUTLIERS ON CLUSTERING ===\n\n")
## === IMPACT OF OUTLIERS ON CLUSTERING ===
# Compare clustering with and without outliers
set.seed(123)
k <- 4
# Cluster with outliers
kmeans_with_outliers <- kmeans(scaled_full, centers = k, nstart = 25)
# Cluster without outliers (already computed earlier)
kmeans_clean <- kmeans(scaled_data, centers = k, nstart = 25)
cat(glue("
With outliers ({nrow(pca_df_full)} Pokemon):
- Total WSS: {round(kmeans_with_outliers$tot.withinss, 2)}
- Between SS / Total SS: {round(kmeans_with_outliers$betweenss / kmeans_with_outliers$totss * 100, 1)}%
Without extreme outliers ({nrow(pca_df)} Pokemon):
- Total WSS: {round(kmeans_clean$tot.withinss, 2)}
- Between SS / Total SS: {round(kmeans_clean$betweenss / kmeans_clean$totss * 100, 1)}%
"))
## With outliers (742 Pokemon):
## - Total WSS: 4525.82
## - Between SS / Total SS: 32.1%
##
## Without extreme outliers (728 Pokemon):
## - Total WSS: 4345.42
## - Between SS / Total SS: 33.6%
# Silhouette comparison
if (requireNamespace("cluster", quietly = TRUE)) {
library(cluster)
sil_with <- silhouette(kmeans_with_outliers$cluster, dist(scaled_full))
sil_without <- silhouette(kmeans_clean$cluster, dist(scaled_data))
cat(glue("
Average Silhouette Width:
- With outliers: {round(mean(sil_with[, 'sil_width']), 3)}
- Without outliers: {round(mean(sil_without[, 'sil_width']), 3)}
"))
cat("\n** Removing outliers improved clustering quality **\n")
}
## Average Silhouette Width:
## - With outliers: 0.14
## - Without outliers: 0.145
## ** Removing outliers improved clustering quality **
6.4 Outlier Insights
cat("\n=== KEY INSIGHTS ABOUT OUTLIERS ===\n\n")
##
## === KEY INSIGHTS ABOUT OUTLIERS ===
outlier_patterns <- extreme_cases |>
count(category, sort = TRUE)
cat("Outliers by Category:\n")
## Outliers by Category:
print(outlier_patterns)
## # A tibble: 2 × 2
## category n
## <chr> <int>
## 1 Regular 2
## 2 Ultra Beast 2
cat(glue("
Conclusions:
1. Most extreme outliers are **{outlier_patterns$category[1]}** Pokemon
2. Outliers represent special/unique designs (e.g., Eternamax forms, ultra-dense cores)
3. Including them in clustering:
- Distorts cluster boundaries
- Creates artificial singleton clusters
- Reduces interpretability
4. **Recommendation**: Analyze them separately as special cases
5. Main clustering analysis uses cleaned data for better pattern recognition
"))
##
## Conclusions:
## 1. Most extreme outliers are **Regular** Pokemon
## 2. Outliers represent special/unique designs (e.g., Eternamax forms, ultra-dense cores)
## 3. Including them in clustering:
## - Distorts cluster boundaries
## - Creates artificial singleton clusters
## - Reduces interpretability
## 4. **Recommendation**: Analyze them separately as special cases
## 5. Main clustering analysis uses cleaned data for better pattern recognition
7. Interpretation and Insights
7.1 PCA Interpretation
cat(glue("
=== PCA KEY FINDINGS ===
1. Variance Explained:
- PC1: {round(summary(pca_result)$importance[2,1]*100, 1)}%
- PC2: {round(summary(pca_result)$importance[2,2]*100, 1)}%
- PC3: {round(summary(pca_result)$importance[2,3]*100, 1)}%
- Cumulative (PC1-3): {round(summary(pca_result)$importance[3,3]*100, 1)}%
2. PC1 Interpretation:
Top loadings: {paste(head(loadings_df$Variable, 3), collapse=', ')}
- Likely represents overall 'power' or 'stat total'
3. PC2 Interpretation:
- Distinguishes between different stat distributions
- May separate physical vs special attackers
4. Separation:
- Legendary Pokemon tend to cluster in high PC1 region
- Clear separation between categories visible
- Generation shows some temporal evolution patterns
"))
## === PCA KEY FINDINGS ===
##
## 1. Variance Explained:
## - PC1: 28.4%
## - PC2: 18.2%
## - PC3: 14.8%
## - Cumulative (PC1-3): 61.5%
##
## 2. PC1 Interpretation:
## Top loadings: weight_kgs, height_m, hp
## - Likely represents overall 'power' or 'stat total'
##
## 3. PC2 Interpretation:
## - Distinguishes between different stat distributions
## - May separate physical vs special attackers
##
## 4. Separation:
## - Legendary Pokemon tend to cluster in high PC1 region
## - Clear separation between categories visible
## - Generation shows some temporal evolution patterns
7.2 Clustering Interpretation
# Identify dominant characteristics of each cluster
cluster_labels <- cluster_stats |>
mutate(
label = case_when(
avg_total > 550 ~ "High-Power",
avg_speed > 1 & avg_attack > 0.5 ~ "Fast Attackers",
avg_defense > 0.5 | avg_sp_def > 0.5 ~ "Defensive",
TRUE ~ "Balanced/Low-Power"
)
) |>
select(cluster, label, count, avg_total, pct_legendary)
print(knitr::kable(cluster_labels,
digits = 1,
caption = "Cluster Interpretations"
))
##
##
## Table: Cluster Interpretations
##
## |cluster |label | count| avg_total| pct_legendary|
## |:-------|:--------------|-----:|---------:|-------------:|
## |1 |High-Power | 50| 666.3| 70.0|
## |2 |Fast Attackers | 57| 531.5| 12.3|
## |3 |Fast Attackers | 300| 492.6| 8.0|
## |4 |Fast Attackers | 321| 519.4| 10.6|
cat(glue("
=== CLUSTERING KEY FINDINGS ===
1. Optimal Clusters: {k}
Based on elbow method and silhouette analysis
2. Cluster Characteristics:
{paste(apply(cluster_labels, 1, function(x)
glue(' Cluster {x[1]}: {x[2]} (n={x[3]}, avg_total={round(as.numeric(x[4]),1)})')),
collapse='\n')}
3. Legendary Distribution:
- Predominantly in high-power clusters
- {round(max(cluster_stats$pct_legendary), 1)}% of one cluster are legendary
4. Practical Use:
- Can identify Pokemon archetypes (tank, sweeper, balanced)
- Useful for team building and role assignment
- Reveals design patterns across generations
"))
## === CLUSTERING KEY FINDINGS ===
##
## 1. Optimal Clusters: 4
## Based on elbow method and silhouette analysis
##
## 2. Cluster Characteristics:
## Cluster 1: High-Power (n= 50, avg_total=666.3)
## Cluster 2: Fast Attackers (n= 57, avg_total=531.5)
## Cluster 3: Fast Attackers (n=300, avg_total=492.6)
## Cluster 4: Fast Attackers (n=321, avg_total=519.4)
##
## 3. Legendary Distribution:
## - Predominantly in high-power clusters
## - 70% of one cluster are legendary
##
## 4. Practical Use:
## - Can identify Pokemon archetypes (tank, sweeper, balanced)
## - Useful for team building and role assignment
## - Reveals design patterns across generations
8. Export Results
# Ensure output directory exists
if (!dir.exists("data/analysis/pca")) dir.create("data/analysis/pca", recursive = TRUE)
# Save PCA results
write_csv(pca_plot_df, "data/analysis/pca/pokemon_pca_scores.csv")
# Save cluster assignments
cluster_export <- cluster_profile_df |>
select(dex, name, category, cluster)
write_csv(cluster_export, "data/analysis/pca/pokemon_clusters.csv")
cat("Results saved to data/analysis/pca/:
- pokemon_pca_scores.csv (with PC1-PC5)
- pokemon_clusters.csv (cluster assignments)
- pokemon_outliers.csv (extreme outliers)
")
## Results saved to data/analysis/pca/:
## - pokemon_pca_scores.csv (with PC1-PC5)
## - pokemon_clusters.csv (cluster assignments)
## - pokemon_outliers.csv (extreme outliers)